library(rtweet)
library(tidyverse)
# library(stringr)
# library(tm) # text mining
# library(SnowballC) # remove common word endings / Stemming
library(tidytext)
library(wordcloud2)
search_tweets
# Number each account's tweets: `line` is the row position of a tweet within
# its screen_name group. The grouping is dropped again afterwards so later
# steps work on an ungrouped table.
tweets_by_tweeter <- tweet_collection %>%
  group_by(screen_name) %>%
  mutate(line = row_number()) %>%
  ungroup()

# Tweets per account, most active accounts first.
count(tweets_by_tweeter, screen_name, sort = TRUE)

# Column-by-column overview of the result.
glimpse(tweets_by_tweeter)
Rows: 381
Columns: 91
$ user_id <chr> "729978319", "2842694434", "1109187936602472449", "26926933", "15919288...
$ status_id <chr> "1321172260313415680", "1321162496166690819", "1321161276408516610", "1...
$ created_at <dttm> 2020-10-27 19:29:47, 2020-10-27 18:50:59, 2020-10-27 18:46:08, 2020-10...
$ screen_name <chr> "DrDanielKolder", "kschwartz827", "chrisgb002000", "CoachDouglas21", "O...
$ text <chr> "@BluejayMBB @j5_twann @Alex_OC11 @BIGEASTMBB @marchmadness @cucoachmac...
$ source <chr> "Twitter for iPhone", "Twitter for iPhone", "Twitter for Android", "Twi...
$ display_text_width <dbl> 279, 43, 218, 22, 270, 50, 276, 259, 52, 63, 43, 177, 27, 103, 15, 101,...
$ reply_to_status_id <chr> "1320769690394832896", "1321140791490703361", NA, NA, NA, NA, NA, NA, N...
$ reply_to_user_id <chr> "325898875", "57422635", NA, NA, NA, NA, NA, NA, NA, NA, "202416362", "...
$ reply_to_screen_name <chr> "BluejayMBB", "MSU_Basketball", NA, NA, NA, NA, NA, NA, NA, NA, "marchm...
$ is_quote <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALS...
$ is_retweet <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ favorite_count <int> 1, 0, 0, 0, 2, 31, 19, 883, 1, 6, 0, 0, 1, 0, 0, 0, 7, 24, 9, 0, 0, 0, ...
$ retweet_count <int> 0, 0, 0, 0, 1, 3, 6, 207, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 1, 0, ...
$ quote_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ reply_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ hashtags <list> [NA, NA, <"LifetimeLonghorn", "OneShiningMoment", "MarchMadness", "Hoo...
$ symbols <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ urls_url <list> [NA, NA, NA, "twitter.com/marchmadness/s…", "osga.com/online_gaming_…"...
$ urls_t.co <list> [NA, NA, NA, "https://t.co/YqppSGtY7N", "https://t.co/LRDrCdovtf", NA,...
$ urls_expanded_url <list> [NA, NA, NA, "https://twitter.com/marchmadness/status/1318959234692927...
$ media_url <list> [NA, NA, "http://pbs.twimg.com/ext_tw_video_thumb/1247537418003046407/...
$ media_t.co <list> [NA, NA, "https://t.co/toBH5bKqyo", NA, NA, "https://t.co/pfRL2EOYSR",...
$ media_expanded_url <list> [NA, NA, "https://twitter.com/chrisgb002000/status/1247537446960525323...
$ media_type <list> [NA, NA, "photo", NA, NA, "photo", NA, "photo", NA, NA, NA, NA, NA, NA...
$ ext_media_url <list> [NA, NA, "http://pbs.twimg.com/ext_tw_video_thumb/1247537418003046407/...
$ ext_media_t.co <list> [NA, NA, "https://t.co/toBH5bKqyo", NA, NA, "https://t.co/pfRL2EOYSR",...
$ ext_media_expanded_url <list> [NA, NA, "https://twitter.com/chrisgb002000/status/1247537446960525323...
$ ext_media_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ mentions_user_id <list> [<"325898875", "3177384938", "2772222795", "1324557265", "202416362", ...
$ mentions_screen_name <list> [<"BluejayMBB", "j5_twann", "Alex_OC11", "BIGEASTMBB", "marchmadness",...
$ lang <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en",...
$ quoted_status_id <chr> NA, NA, NA, "1318959234692927488", NA, NA, NA, NA, "1321131895300804610...
$ quoted_text <chr> NA, NA, NA, "#KatzRankz - #WestAward Candidates, as heard on the #MM365...
$ quoted_created_at <dttm> NA, NA, NA, 2020-10-21 16:56:01, NA, NA, NA, NA, 2020-10-27 16:49:23, ...
$ quoted_source <chr> NA, NA, NA, "Twitter Web App", NA, NA, NA, NA, "Twitter Media Studio", ...
$ quoted_favorite_count <int> NA, NA, NA, 850, NA, NA, NA, NA, 31, 31, NA, NA, NA, NA, NA, NA, NA, 88...
$ quoted_retweet_count <int> NA, NA, NA, 158, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, NA, NA, NA, 207,...
$ quoted_user_id <chr> NA, NA, NA, "202416362", NA, NA, NA, NA, "202416362", "202416362", NA, ...
$ quoted_screen_name <chr> NA, NA, NA, "marchmadness", NA, NA, NA, NA, "marchmadness", "marchmadne...
$ quoted_name <chr> NA, NA, NA, "NCAA March Madness", NA, NA, NA, NA, "NCAA March Madness",...
$ quoted_followers_count <int> NA, NA, NA, 1418660, NA, NA, NA, NA, 1418660, 1418660, NA, NA, NA, NA, ...
$ quoted_friends_count <int> NA, NA, NA, 815, NA, NA, NA, NA, 815, 815, NA, NA, NA, NA, NA, NA, NA, ...
$ quoted_statuses_count <int> NA, NA, NA, 29870, NA, NA, NA, NA, 29870, 29870, NA, NA, NA, NA, NA, NA...
$ quoted_location <chr> NA, NA, NA, "", NA, NA, NA, NA, "", "", NA, NA, NA, NA, NA, NA, NA, "",...
$ quoted_description <chr> NA, NA, NA, "The official NCAA March Madness destination for all things...
$ quoted_verified <lgl> NA, NA, NA, TRUE, NA, NA, NA, NA, TRUE, TRUE, NA, NA, NA, NA, NA, NA, N...
$ retweet_status_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_text <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_created_at <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ retweet_source <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_favorite_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_retweet_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_user_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_screen_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_followers_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_friends_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_statuses_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_description <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_verified <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ place_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, "https://api.twitter.com/1.1/geo/id/5c0...
$ place_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Buford", NA, NA, NA, NA, NA, NA, NA, N...
$ place_full_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Buford, GA", NA, NA, NA, NA, NA, NA, N...
$ place_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, "city", NA, NA, NA, NA, NA, NA, NA, NA,...
$ country <chr> NA, NA, NA, NA, NA, NA, NA, NA, "United States", NA, NA, NA, NA, NA, NA...
$ country_code <chr> NA, NA, NA, NA, NA, NA, NA, NA, "US", NA, NA, NA, NA, NA, NA, NA, NA, N...
$ geo_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>,...
$ coords_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>,...
$ bbox_coords <list> [<NA, NA, NA, NA, NA, NA, NA, NA>, <NA, NA, NA, NA, NA, NA, NA, NA>, <...
$ status_url <chr> "https://twitter.com/DrDanielKolder/status/1321172260313415680", "https...
$ name <chr> "Dr. Daniel G. Kolder", "Kevin", "Chris Bennett (CB)", "CoachD", "OSGA"...
$ location <chr> "Camarillo, CA", "", "Clovis/Germany/Cali/ATX/WA", "Oklahoma City, OK",...
$ description <chr> "Southern California Plastic Surgeon", "Michigan fan, golfer, Daily Fan...
$ url <chr> "https://t.co/w4CTVrZdL9", NA, NA, NA, "https://t.co/EFlm6UqO6f", "http...
$ protected <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ followers_count <int> 1173, 5, 3390, 3814, 1732, 1418660, 1418660, 1418660, 4498, 1255, 308, ...
$ friends_count <int> 133, 92, 4244, 1878, 4146, 815, 815, 815, 941, 872, 251, 4833, 120, 554...
$ listed_count <int> 24, 0, 19, 43, 34, 3815, 3815, 3815, 18, 3, 0, 47, 2, 54, 7, 6807, 6807...
$ statuses_count <int> 1704, 286, 131418, 331327, 8712, 29870, 29870, 29870, 4189, 1916, 2300,...
$ favourites_count <int> 2351, 120, 331, 34345, 926, 1695, 1695, 1695, 1757, 18040, 4611, 57931,...
$ account_created_at <dttm> 2012-08-01 04:38:50, 2014-10-24 17:07:44, 2019-03-22 20:19:46, 2009-03...
$ verified <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALS...
$ profile_url <chr> "https://t.co/w4CTVrZdL9", NA, NA, NA, "https://t.co/EFlm6UqO6f", "http...
$ profile_expanded_url <chr> "http://www.pacificaplasticsurgery.com", NA, NA, NA, "http://www.osga.c...
$ account_lang <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ profile_banner_url <chr> "https://pbs.twimg.com/profile_banners/729978319/1518222766", NA, "http...
$ profile_background_url <chr> "http://abs.twimg.com/images/themes/theme1/bg.png", "http://abs.twimg.c...
$ profile_image_url <chr> "http://pbs.twimg.com/profile_images/714501386155782146/Ybmfn3B5_normal...
$ line <int> 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4,...
"Because we have kept text such as hashtags and usernames in the dataset, we can’t use a simple anti_join() to remove stop words. Instead, we can take the approach shown in the filter() line that uses str_detect() from the stringr package." – https://www.tidytextmining.com/twitter.html
# One row per (tweet, word). Only the tweet text, its author and the
# per-author tweet number are carried along.
#
# Stop words are dropped both in their listed form and with apostrophes
# stripped, and only tokens containing at least one lowercase letter are kept.
tweets_tokenized <- tweets_by_tweeter %>%
  select(text, screen_name, line) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(
    !word %in% c(stop_words$word, str_remove_all(stop_words$word, "'"))
  ) %>%
  filter(str_detect(word, "[a-z]"))

tweets_tokenized
# Relative word frequency per account.
#
#   n     = number of times `word` appears in tweets by `screen_name`
#   total = number of tokens kept for that `screen_name`
#   freq  = n / total
#
# Counting on (screen_name, word) directly yields an ungrouped result, so no
# grouping leaks into later steps. The join key is given explicitly rather
# than inferred from the shared column names.
frequency <- tweets_tokenized %>%
  count(screen_name, word, sort = TRUE) %>%
  left_join(
    count(tweets_tokenized, screen_name, name = "total"),
    by = "screen_name"
  ) %>%
  mutate(freq = n / total)
`summarise()` ungrouping output (override with `.groups` argument)
Joining, by = "screen_name"
frequency
"This is a nice and tidy data frame but we would actually like to plot those frequencies on the x- and y-axes of a plot, so we will need to use spread() from tidyr [to] make a differently shaped data frame." – https://www.tidytextmining.com/twitter.html
pivot_wider
# Reshape to one row per word and one column per account, each cell holding
# that account's relative frequency for the word.
# NOTE: this overwrites the long-form `frequency` table.
frequency <- pivot_wider(
  select(frequency, screen_name, word, freq),
  names_from = screen_name,
  values_from = freq
)

frequency

# Older tidyr spelling of the same reshape, kept for reference:
# frequency %>%
#   select(screen_name, word, freq) %>%
#   spread(screen_name, freq)
# Scatter plot comparing how often two accounts use each word.
#
# data:   wide frequency table with a `word` column and one column of
#         relative frequencies per account
# x_user: name (string) of the account column shown on the x axis
# y_user: name (string) of the account column shown on the y axis
#
# Both axes are log10-scaled and labelled as percentages; the red line is
# y = x, i.e. words both accounts use equally often.
# Returns the ggplot object.
plot_frequency_comparison <- function(data, x_user, y_user) {
  # Fail early with a clear message if a requested column is missing.
  stopifnot(all(c("word", x_user, y_user) %in% names(data)))

  ggplot(data, aes(.data[[x_user]], .data[[y_user]])) +
    geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
    geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
    scale_x_log10(labels = scales::percent_format()) +
    scale_y_log10(labels = scales::percent_format()) +
    geom_abline(color = "firebrick") +
    # Use the plain account names as axis titles.
    labs(x = x_user, y = y_user)
}

plot_frequency_comparison(frequency, "CBBCent1", "Adam_Bradford14")

# marchmadness TheAndyKatz
plot_frequency_comparison(frequency, "marchmadness", "TheAndyKatz")
# Log odds ratio of word usage between CBBCent1 and Adam_Bradford14.
#
# Steps:
#   1. keep only tokens from the two accounts and drop @mentions
#   2. count each word per account, keeping words used at least twice overall
#   3. spread to one column per account (a word an account never used gets 0)
#   4. turn each count into (count + 1) / (column total + 1); the +1 keeps
#      zero counts from producing log(0) or a division by zero
#   5. logratio > 0 means the word is relatively more common for CBBCent1,
#      logratio < 0 means relatively more common for Adam_Bradford14
word_ratios <- tweets_tokenized %>%
  filter(screen_name %in% c("CBBCent1", "Adam_Bradford14")) %>%
  filter(!str_detect(word, "^@")) %>%
  count(word, screen_name) %>%
  group_by(word) %>%
  filter(sum(n) >= 2) %>%
  ungroup() %>%
  pivot_wider(names_from = screen_name, values_from = n, values_fill = 0) %>%
  mutate(
    across(c(CBBCent1, Adam_Bradford14), ~ (.x + 1) / (sum(.x) + 1))
  ) %>%
  mutate(logratio = log(CBBCent1 / Adam_Bradford14)) %>%
  arrange(desc(logratio))

word_ratios

# Words used about equally by both accounts (log ratio closest to zero) first.
word_ratios %>%
  arrange(abs(logratio))
# Bar chart of the 15 most distinctive words for each account, i.e. the
# largest |logratio| on each side of zero. Bars are ordered by logratio and
# the chart is flipped so the words read horizontally.
#
# The fill is mapped to `logratio < 0`, whose levels sort as FALSE, TRUE:
#   FALSE (logratio >= 0) -> word leans towards CBBCent1
#   TRUE  (logratio <  0) -> word leans towards Adam_Bradford14
# so the legend labels must be given in that order.
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col() + # add show.legend = FALSE to hide the legend
  coord_flip() +
  ylab("log odds ratio (CBBCent1/Adam_Bradford14)") +
  scale_fill_discrete(name = "", labels = c("CBBCent1", "Adam_Bradford14"))
# Clean a tm corpus, count stemmed terms and draw word clouds.
#
# `dfCorpus` is expected to be a tm corpus created before this point
# (NOTE(review): it is not defined in the visible part of the file).
# tm functions are namespace-qualified because this file does not attach tm.

# make lower case
# A non-tm function must be wrapped in content_transformer(); otherwise
# tm_map() replaces the documents with bare character vectors and
# TermDocumentMatrix() fails later on.
corpus.prep <- tm::tm_map(dfCorpus, tm::content_transformer(str_to_lower))

# remove white space
corpus.prep <- tm::tm_map(corpus.prep, tm::stripWhitespace)

# remove punctuation
corpus.prep <- tm::tm_map(corpus.prep, tm::removePunctuation)

# remove numbers
corpus.prep <- tm::tm_map(corpus.prep, tm::removeNumbers)

# head(stopwords("english"))
# remove stop words
corpus <- tm::tm_map(corpus.prep, tm::removeWords, tm::stopwords("english"))
# docs <- tm_map(docs, removeWords, c("department", "email"))

# stem remaining words (tm's stemmer relies on the SnowballC package)
corpus <- tm::tm_map(corpus, tm::stemDocument)

# dtm <- DocumentTermMatrix(docs)
# Terms in rows, documents in columns.
dtm2 <- tm::TermDocumentMatrix(corpus)
m <- as.matrix(dtm2)

# Total count of each term across all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# Skip the single most frequent term and keep the next 199.
# NOTE(review): presumably the top term is the search keyword itself - confirm.
d <- d %>%
  slice(2:200)

wordcloud2(d, color = "random-dark", backgroundColor = "orange")
wordcloud2(
  d,
  size = 0.3,
  shape = "star",
  color = "random-light",
  backgroundColor = "black",
  fontFamily = "Loma"
)
# letterCloud(d, word="R", size = 1, fontFamily="Loma", backgroundColor = 'black')
http://antonio-ferraro.eu.pn/word-clouds-in-r-packages-wordcloud2-and-tm/
https://jrnold.github.io/qss-tidy/discovery.html#textual-data
https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html
of less use